################################################################################################
## Replication File for 
## "Capturing Clicks: How the Chinese Government Uses Clickbait to Compete for Visibility"
## Yingdan Lu and Jennifer Pan
## April, 2020
################################################################################################

################################################################################################
## Descriptive statistics
################################################################################################


######### Set Up #########
#install.packages("dplyr")
library(dplyr) #version 0.8.1
#install.packages("readr")
library(readr) #version 1.3.1
#install.packages("lubridate")
library(lubridate) #version 1.7.4
#install.packages("ggplot2")
library(ggplot2) #version 3.2.1
#install.packages("extrafont")
library(extrafont) #version 0.17
#install.packages("xtable")
library(xtable) #version 1.8-4

# register the fonts imported by extrafont so that 'Times New Roman' can be
# used in the figures below
loadfonts()

# NOTE(review): changing the working directory inside a script is fragile --
# every relative path below is resolved from the PARENT of the directory the
# script is started in. Kept as-is because the data paths depend on it.
setwd("..")

# import the post data and account data
# (TRUE/FALSE spelled out: T and F are ordinary variables that can be reassigned)
dt <- read.csv("data/total_posts.csv", header = TRUE, encoding = "UTF-8",
               stringsAsFactors = FALSE)
city_data <- read.csv("data/city_data.csv", header = TRUE, encoding = "UTF-8",
                      stringsAsFactors = FALSE)
nongov <- read.csv("data/nongov_posts.csv", header = TRUE, encoding = "UTF-8",
                   stringsAsFactors = FALSE)

######### Figure 2 ###########
# Line chart of the number of posts per day between 2018-05-25 and 2019-05-25.

# construct the dataframe with date frequencies
# (table() yields the columns Var1 = date string and Freq = number of posts)
# NOTE(review): as.Date() is called without a format, so dt$date_pek is assumed
# to be in a format it parses by default (e.g. YYYY-MM-DD) -- confirm.
tab <- as.data.frame(table(dt$date_pek), stringsAsFactors = FALSE)
tab$date <- as.Date(tab$Var1)

# create a sequence of all dates from 2018-05-25 to 2019-05-25
x <- seq(from = as.Date("2018-05-25"), to = as.Date("2019-05-25"), by = "1 day")

# left-join the counts onto the full calendar so that days without any post
# are kept; columns are selected by name rather than by position
a <- merge(data.frame(date = x), tab[, c("Freq", "date")],
           by = "date", all.x = TRUE)

# code 0 for dates that have no post
a$Freq[is.na(a$Freq)] <- 0

# plot the frequency by day
# (a single line layer: the original drew a default-width line and then a
# size-1 line on top of it, so the first layer was redundant)
ggplot(a, aes(x = date, y = Freq)) +
  geom_line(size = 1) +
  theme_bw(base_size = 16, base_family = "Times New Roman") +
  xlab("2018-2019") +
  ylab("Number of posts") +
  scale_x_date(date_labels = "%b", date_breaks = "1 month") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(text = element_text(size = 16, colour = "black"),
        axis.title.x = element_text(size = 16, colour = "black"),
        axis.title.y = element_text(size = 16, colour = "black"),
        axis.text.x = element_text(size = 16, colour = "black"),
        axis.text.y = element_text(size = 16, colour = "black"))


######### Appendix Figure A1 ###########
# Counts by account affiliation. %in% never returns NA, so rows with a missing
# Account_affiliation are not counted (df[cond, ] would have kept them as NA rows).
# accounts affiliated with the city propaganda department
sum(city_data$Account_affiliation %in% c(3, 4))
# accounts affiliated with the information office of the local government
sum(city_data$Account_affiliation %in% 2)

# calculate the number of accounts registered before Dec 31 of each year
city_data$registration <- mdy(city_data$registration)
years <- 2013:2019
date_store <- data.frame(
  begin_year = years,
  city_accounts = vapply(
    years,
    function(yr) {
      cutoff <- as.Date(sprintf("%d-12-31", yr))
      # sum() over the logical vector counts the TRUEs directly. Indexing
      # table(...) by position ([2] or [1]) only works when both FALSE and
      # TRUE happen to be present (or absent) in exactly the expected years.
      # na.rm = TRUE mirrors table(), which ignores NA registrations.
      sum(city_data$registration < cutoff, na.rm = TRUE)
    },
    integer(1)
  )
)

# plot the number of accounts by year
ggplot(data = date_store, aes(x = begin_year, y = city_accounts)) +
  geom_point(size = 3) +
  geom_line(size = 1) +
  geom_text(aes(label = city_accounts), hjust = 0.7, vjust = -1, size = 5,
            family = "Times New Roman") +
  xlab("Year") +
  ylab("Number of accounts") +
  scale_x_continuous(breaks = years) +
  ylim(0, 250) +
  theme_bw(base_size = 16, base_family = "Times New Roman") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(axis.title.x = element_text(size = 16, colour = "black"),
        text = element_text(size = 16, colour = "black"),
        axis.text.x = element_text(size = 16, colour = "black"),
        axis.text.y = element_text(size = 16, colour = "black"),
        axis.title.y = element_text(size = 16, colour = "black"))


######### Appendix Table A2 ###########
# Per-account summary of the non-government posts: number of posts, mean reads,
# and mean likes (likes restricted to posts dated on or after 2019-03-14).
nongov$date_pek <- mdy(nongov$date_pek)

# NOTE(review): likes are only averaged from this date onwards; the reason for
# the cutoff is not visible in this script -- confirm against the paper.
likes_start <- as.Date("2019-03-14")

# All three statistics are computed in one grouped summary so that every value
# is attached to its own account by construction. The previous version built
# the likes column with a separate aggregate() on a filtered subset and pasted
# it on by position, which fails (or misaligns) when an account has no post on
# or after the cutoff or when the two group orderings differ. An account with
# no qualifying post now gets NaN for likes.
nongov_byaccount <- nongov %>%
  group_by(account_name) %>%
  summarise(
    number_posts = n(),
    reads = mean(reads),
    # posts with a missing date are left out of the likes average
    likes = mean(likes[!is.na(date_pek) & date_pek >= likes_start])
  )

#output the results
xtable(nongov_byaccount)
